This script is a template workflow for scoring Qualtrics data using the scorequaltrics package built by John Flournoy. It is a pared-down version of the tutorial he created for the TDS study.
To pull data from Qualtrics, you need a credentials file with an API token associated with your account. To create the file, follow these steps.
Generate an API token for Qualtrics by following the steps in the Qualtrics API documentation (the link appears in the original tutorial).
Create qualtrics_credentials.yaml in the credentialDir and add API token information
# Create the Qualtrics credentials file if it does not already exist.
credentialDir='/Users/danicosme/' # replace with your path
if [ ! -f "${credentialDir}qualtrics_credentials.yaml" ]; then
  # quote the path in case it contains spaces; bail out if the directory is missing
  cd "${credentialDir}" || exit 1
  touch qualtrics_credentials.yaml
  echo "token: Ik0XNN...." >> qualtrics_credentials.yaml #replace with your token information
  echo "baseurl: oregon.qualtrics.com" >> qualtrics_credentials.yaml
  echo "credential file created"
else
  echo "credential file already exists in this location"
fi
## credential file already exists in this location
# Load required packages, installing any that are missing.
# NOTE: `require()` is used deliberately here (instead of `library()`)
# because it returns FALSE rather than erroring when a package is absent,
# which is what triggers the install.
if (!require(tidyverse)) {
  install.packages('tidyverse')
}
if (!require(knitr)) {
  install.packages('knitr')
}
if (!require(devtools)) {
  install.packages('devtools')
}
if (!require(scorequaltrics)) {
  # scorequaltrics is not on CRAN; install from GitHub (dev/enhance branch)
  devtools::install_github('dcosme/qualtrics', ref = "dev/enhance")
}
if (!require(ggcorrplot)) {
  install.packages('ggcorrplot')
}

# Workflow parameters (set below):
#   cred_file_location = path to your Qualtrics credential file. You'll need to
#                        generate this via Qualtrics using the instructions above.
#   keep_columns       = subject ID column name and any other columns in the
#                        Qualtrics survey you want to keep in wide format (all
#                        others will be gathered into a key-value pair); can be
#                        a regular expression
#   survey_name_filter = regular expression to select surveys
#   sid_pattern        = regular expression for participant IDs
#   exclude_sid        = regular expression for participant IDs to exclude
#                        (e.g. test responses)
#   identifiable_data  = identifiable data you do not want to include in the
#                        dataframe
#   output_file_dir    = output file directory
#   rubric_dir         = scoring rubric directory
cred_file_location = '~/qualtrics_credentials.yaml'
# Workflow parameters (see the definitions above).
keep_columns = '(Login|ResponseId|Finished)'
survey_name_filter = 'DEV .* Surveys'
sid_pattern = 'DEV[0-9]{3}$'
exclude_sid = '^99|DEV999|DEV000|DEV998|DEV737' # subject IDs to exclude
identifiable_data = c('IPAddress', "RecipientEmail", "RecipientLastName", "RecipientFirstName",
                      "LocationLatitude", "LocationLongitude") # exclude when printing duplicates
output_file_dir = '~/Documents/code/score-qualtrics'
rubric_dir = '~/Documents/code/score-qualtrics/rubrics'
# Filter available surveys based on the filter specified above.
# load credential file
credentials = scorequaltrics::creds_from_file(cred_file_location)

# pull the list of available surveys and keep only those whose name
# matches survey_name_filter
surveysAvail = scorequaltrics::get_surveys(credentials)
surveysFiltered = filter(surveysAvail, grepl(survey_name_filter, SurveyName))

# print the filtered survey names as a sorted table
knitr::kable(arrange(select(surveysFiltered, SurveyName), SurveyName))
## | SurveyName |
|---|
| DEV Session 0 Surveys |
| DEV Session 1 Surveys |
| DEV Session 2 Surveys |
| DEV Session 3 Surveys |
| DEV Session 4 Surveys |
| DEV Session 5 Surveys |
The get_survey_data function pulls the data from the surveys specified in surveysFiltered and reshapes into the long format. Because the example data also includes some identifying information, we also want to filter those items out of our dataframe.
To automatically score the surveys, scoring rubrics with the following format must be provided:
Required columns
scale name = name of the scale
column name = item name used in Qualtrics
reverse = reverse scoring flag (1 = yes, 0 = no)
min = minimum value for numeric items
max = maximum value for numeric items
transform = transformation function; use 0 for all items as we’re not transforming the data during scoring
User-generated column names
Item values in user-created columns
Scoring rubrics should exist in rubric_dir and be named according to the following convention: [measure]_scoring_rubric.csv
# specify rubric paths
# NOTE: the pattern anchors on "\\.csv$" so that only .csv files are matched
# (the original unescaped ".csv" would also match e.g. "Xcsv").
scoring_rubrics = data.frame(file = dir(file.path(rubric_dir),
                                        pattern = '.*scoring_rubric.*\\.csv$',
                                        full.names = TRUE))

# read in rubrics
scoring_data_long = scorequaltrics::get_rubrics(scoring_rubrics,
                                                type = 'scoring')

# print the first 10 rows (dropping the first column)
head(scoring_data_long[, -1], 10)
# First, check participant IDs that don't match the participant ID regular
# expression pattern.
Tidy incorrectly formatted participant IDs and exclude responses that are not subject responses.
# print incorrectly formatted IDs
# Preview the ID corrections: take every unique SID that does NOT match
# sid_pattern and show what it would be mapped to. The replacements are
# order-dependent: case is normalized first (Dev/dev -> DEV), then stray
# characters after the prefix (I, l, o) are removed, then bare 3-digit IDs
# get the DEV prefix, then a leading zero in 4-digit IDs is dropped, and
# finally one known one-off (DEVO55, letter O) is fixed by hand.
surveys_long %>%
select(SID) %>%
unique() %>%
filter(!grepl(sid_pattern, SID)) %>%
mutate(SID_new = gsub("Dev", "DEV", SID),
SID_new = gsub("dev", "DEV", SID_new),
SID_new = gsub("DEVI", "DEV", SID_new),
SID_new = gsub("DEVl", "DEV", SID_new),
SID_new = gsub("DEVo", "DEV", SID_new),
# bare 3-digit responses (e.g. "123") get the DEV prefix added
SID_new = ifelse(grepl("^[0-9]{3}$", SID_new), paste0("DEV", SID_new), SID_new),
# 4-digit IDs (e.g. DEV0123) have the extra leading zero removed
SID_new = ifelse(grepl("DEV[0-9]{4}", SID_new), gsub("DEV0", "DEV", SID_new), SID_new),
SID_new = ifelse(SID == "DEVO55", "DEV055", SID_new)) %>%
arrange(SID_new)# updated IDs in the survey data
# Apply the same ID corrections previewed above directly to the survey data,
# then drop anything that still fails sid_pattern and any excluded IDs
# (test accounts etc. matched by exclude_sid). The gsub/ifelse order must
# mirror the preview block exactly.
surveys_sub = surveys_long %>%
mutate(SID = gsub("Dev", "DEV", SID),
SID = gsub("dev", "DEV", SID),
SID = gsub("DEVI", "DEV", SID),
SID = gsub("DEVl", "DEV", SID),
SID = gsub("DEVo", "DEV", SID),
SID = ifelse(grepl("^[0-9]{3}$", SID), paste0("DEV", SID), SID),
SID = ifelse(grepl("DEV[0-9]{4}", SID), gsub("DEV0", "DEV", SID), SID),
SID = ifelse(SID == "DEVO55", "DEV055", SID)) %>%
filter(grepl(sid_pattern, SID)) %>%
filter(!grepl(exclude_sid, SID))
# print unique SIDs
unique(sort(surveys_sub$SID))## [1] "DEV001" "DEV002" "DEV004" "DEV005" "DEV006" "DEV007" "DEV008" "DEV009"
## [9] "DEV010" "DEV011" "DEV012" "DEV013" "DEV014" "DEV015" "DEV016" "DEV017"
## [17] "DEV018" "DEV019" "DEV020" "DEV021" "DEV022" "DEV023" "DEV024" "DEV025"
## [25] "DEV026" "DEV027" "DEV028" "DEV029" "DEV030" "DEV031" "DEV032" "DEV033"
## [33] "DEV034" "DEV035" "DEV036" "DEV037" "DEV038" "DEV039" "DEV040" "DEV041"
## [41] "DEV042" "DEV043" "DEV044" "DEV045" "DEV046" "DEV047" "DEV048" "DEV049"
## [49] "DEV050" "DEV051" "DEV052" "DEV053" "DEV054" "DEV055" "DEV056" "DEV057"
## [57] "DEV058" "DEV059" "DEV060" "DEV061" "DEV062" "DEV063" "DEV064" "DEV065"
## [65] "DEV066" "DEV067" "DEV068" "DEV069" "DEV070" "DEV071" "DEV072" "DEV073"
## [73] "DEV074" "DEV075" "DEV076" "DEV077" "DEV078" "DEV079" "DEV080" "DEV081"
## [81] "DEV082" "DEV083" "DEV084" "DEV085" "DEV086" "DEV087" "DEV088" "DEV089"
## [89] "DEV090" "DEV091" "DEV092" "DEV093" "DEV094" "DEV095" "DEV096" "DEV097"
## [97] "DEV098" "DEV099" "DEV100" "DEV101" "DEV102" "DEV103" "DEV104" "DEV105"
## [105] "DEV106" "DEV107" "DEV108" "DEV109" "DEV110" "DEV111" "DEV112" "DEV113"
## [113] "DEV114" "DEV115" "DEV116" "DEV117" "DEV118" "DEV119" "DEV120" "DEV121"
## [121] "DEV122" "DEV123" "DEV124" "DEV125" "DEV126" "DEV127" "DEV128" "DEV129"
## [129] "DEV130" "DEV132" "DEV133" "DEV134" "DEV135" "DEV136" "DEV137" "DEV138"
## [137] "DEV139" "DEV140" "DEV141" "DEV142" "DEV143" "DEV144" "DEV145" "DEV146"
## [145] "DEV147" "DEV148" "DEV149" "DEV150" "DEV151" "DEV152" "DEV153" "DEV154"
## [153] "DEV155" "DEV156" "DEV157" "DEV158" "DEV159" "DEV161" "DEV162" "DEV163"
## [161] "DEV164" "DEV165" "DEV167" "DEV168" "DEV169" "DEV170" "DEV171" "DEV172"
## [169] "DEV173" "DEV174" "DEV175" "DEV176" "DEV178" "DEV179" "DEV181" "DEV182"
## [177] "DEV183" "DEV184" "DEV185" "DEV186" "DEV187" "DEV189" "DEV198" "DEV208"
## [185] "DEV215" "DEV239" "DEV245" "DEV249" "DEV250" "DEV264" "DEV280"
Convert missing values to NA.
Check for non-numeric items using the get_uncoercibles() function.
# List item values that cannot be coerced to numeric (first 10 distinct
# item/value pairs), so they can be fixed by hand before scoring.
surveys_long_na %>%
  scorequaltrics::get_uncoercibles() %>%
  distinct(item, value) %>%
  arrange(item) %>%
  head(., 10)
# Make manual edits before converting values to numeric during scoring.
Here’s an example from another survey, but we’ll skip this for the DEV example since there’s nothing to modify.
# save ethnicity information as a separate variable
CVS_3 = surveys_long_na %>%
  mutate(value = ifelse(item == "CVS_3", tolower(value), value)) %>%
  filter(item == "CVS_3")

# make manual edits and convert values to numeric
# (each correction targets one known SID x item pair; the values stay
# character here and are coerced to numeric later during scoring)
surveys_long_man = surveys_long_na %>%
  mutate(value = ifelse(SID == "FP007" & item == "CVS_1", "18",
                 ifelse(SID == "FP006" & item == "CVS_15", "3.47",
                 ifelse(SID == "FP002" & item == "CVS_16", "3",
                 ifelse(SID == "FP006" & item == "CVS_16", "3.7", value)))))
# Check for duplicate responses. There is a clean_dupes function that can do
# this, but since we have multiple waves with the same surveys, we're going
# to do this homebrew.
# Identify survey x participant combinations with more than one response.
# (Outer parentheses print the result while assigning it.)
(duplicates = surveys_long_na %>%
  spread(item, value) %>%
  group_by(survey_name, SID) %>%
  summarize(n = n()) %>%
  arrange(desc(n)) %>%
  filter(n > 1) %>%
  mutate(survey_SID = sprintf("%s_%s", survey_name, SID)))
# For each participant, determine which survey to use and filter out the
# others using ResponseId.
Select the survey responses with the least missing data, or select the last survey if n_missing is equal.
NOTE: Should also verify that these are duplicate responses and not accidental responses to the wrong wave.
# calculate the number of missing responses per survey
# (restricted to the duplicated survey x SID combinations found above, so the
# response with the least missing data can be chosen for each duplicate set)
(n_missing = surveys_long_na %>%
mutate(survey_SID = sprintf("%s_%s", survey_name, SID)) %>%
filter(survey_SID %in% duplicates$survey_SID) %>%
filter(is.na(value)) %>%
group_by(survey_SID, ResponseId) %>%
summarize(n_missing = n()))# filter out the responses selected to use in scoring
# The ResponseIds listed below are the responses SELECTED to keep for scoring
# (chosen by hand from the n_missing table above). Dropping them from
# n_missing leaves only the duplicate responses to be excluded.
exclude_response_ids = n_missing %>%
filter(!ResponseId %in% c("R_2OIHE4ktdYcm0a7", "R_BEf3vfYhagbnNUl", "R_2TBslmkLEmzJhQO",
"R_1Cr7yJZzyXziAXg", "R_3EMSgAcGdBy5lE6", "R_9tv4Lh4mmYBQaY1",
"R_vGDsF1Xq6wdXCGB", "R_3nAVlaBwD9kU9xP", "R_AyxxcgRp9hFm2Zj",
"R_24EEWwArG7Nrk77", "R_zfqhMGfThNTHGx3", "R_t03olQ4jM7h4xhL",
"R_3lGitKxAqTI9ri8", "R_pb093Br4yxEDbPP", "R_3rNTwD6P0nqzZVf",
"R_3mkjdwllly6g0Gy", "R_9MJVzMgBmdjslyx", "R_336XpIG9yfTL6zS",
"R_beGVZ2QKu7GcxZT", "R_2S7vv3fDokVJwYI", "R_3qrFrS76kW2BhML",
"R_UFN8NOCErVF0fVD", "R_2ZUHiwY8M0s6Wbx", "R_5mQ02A8ywtBVv45",
"R_3mjH74cQsiSOJ5A", "R_sZ2ohYJtIajBDXP", "R_1gNrV5xUzrtLY3D",
"R_w0LUkf7sF5c4eop", "R_3LkiPMqFoXZaP54", "R_1ot2j6SlKbTRjkP",
"R_1Kln0VRBOvZJ1cO", "R_1N9YqQ4ewGSeZg9", "R_3jZisXIgjxuaa4h",
"R_31hJJws8nEQUDTG", "R_3Euk7CvKsVqGw8Q", "R_0xH8tsvKPDGW1DX",
"R_23gcbMleuUGfl1X", "R_27DrVFkCdIZvwNn", "R_3PO9Uy5U0RQlZtI",
"R_zZ7CeIl3eP6O6Vb", "R_3M68oO9gYPMrY0V", "R_PHxUCLXK4S0d5uN",
"R_tSetLV0PGrfGS5j", "R_2dEHbphOgFbGwkr", "R_RDnjHxeAMqw40s9",
"R_YRqc1ppx2JqoLGV", "R_3lK38brLaUxPrV5", "R_28SdLIJLFkBxPc7",
"R_2OMT5G2RmSkxqzM"))
# filter out duplicated responses
surveys_long_clean = surveys_long_na %>%
  filter(!ResponseId %in% exclude_response_ids$ResponseId) %>%
  select(-ResponseId)

# Plot score distributions, one plot per scale_name (subscales on the x-axis).
scored %>%
  filter(!method == "I") %>% # filter out non-numeric data
  mutate(score = as.numeric(score)) %>%
  group_by(scale_name) %>%
  do({
    plot = ggplot(., aes(scored_scale, score)) +
      geom_boxplot() +
      geom_jitter(height = .01, width = .15, alpha = .5, color = "#2A908B") +
      labs(x = "", y = "score\n", title = sprintf("%s\n", .$scale_name[[1]])) +
      theme_minimal(base_size = 16) +
      theme(text = element_text(family = "Futura Medium", colour = "black"),
            legend.text = element_text(size = 8),
            axis.text = element_text(color = "black"),
            axis.text.x = element_text(angle = 45, vjust = 1, hjust = 1),
            panel.grid.major = element_blank(),
            panel.grid.minor = element_blank(),
            panel.border = element_blank(),
            panel.background = element_blank(),
            plot.title = element_text(hjust = 0.5))
    print(plot)
    # do() must return a data frame; plotting is the side effect we want
    data.frame()
  })

# Plot score distributions per scale_name x scored_scale combination.
scored %>%
filter(!method == "I") %>% # filter out non-numeric data
  mutate(score = as.numeric(score)) %>%
  group_by(scale_name, scored_scale) %>%
  do({
    plot = ggplot(., aes(scored_scale, score)) +
      geom_boxplot() +
      geom_jitter(height = .01, width = .15, alpha = .5, color = "#2A908B") +
      labs(x = "", y = "score\n", title = sprintf("%s %s\n", .$scale_name[[1]], .$scored_scale[[1]])) +
      theme_minimal(base_size = 16) +
      theme(text = element_text(family = "Futura Medium", colour = "black"),
            axis.text = element_text(color = "black"),
            axis.text.x = element_text(angle = 45, vjust = 1, hjust = 1),
            panel.grid.major = element_blank(),
            panel.grid.minor = element_blank(),
            panel.border = element_blank(),
            panel.background = element_blank(),
            plot.title = element_text(hjust = 0.5))
    print(plot)
    # do() must return a data frame; plotting is the side effect we want
    data.frame()
  })

# Plot the amount of missing data per scale.
scored %>%
filter(!method == "I") %>% # filter out non-numeric data
  mutate(score = as.numeric(score)) %>%
  group_by(scale_name) %>%
  do({
    # NOTE(review): this panel plots n_missing, so the y label is corrected
    # from "score" to "n missing". The title's scored_scale is just the first
    # value in the group (grouping is by scale_name only) — confirm intended.
    plot = ggplot(., aes(scored_scale, n_missing)) +
      geom_violin() +
      geom_jitter(height = .01, width = .15, alpha = .5, color = "#2A908B") +
      labs(title = sprintf("%s %s\n", .$scale_name[[1]], .$scored_scale[[1]])) +
      labs(x = "", y = "n missing\n") +
      theme_minimal(base_size = 16) +
      theme(text = element_text(family = "Futura Medium", colour = "black"),
            axis.text = element_text(color = "black"),
            axis.text.x = element_text(angle = 45, vjust = 1, hjust = 1),
            panel.grid.major = element_blank(),
            panel.grid.minor = element_blank(),
            panel.border = element_blank(),
            panel.background = element_blank(),
            plot.title = element_text(hjust = 0.5))
    print(plot)
    # do() must return a data frame; plotting is the side effect we want
    data.frame()
  })
# For those variables that were measured more than once, plot changes.
# For scales measured at multiple waves, plot individual trajectories
# (faint lines) with the bootstrapped group mean overlaid.
scored %>%
  filter(!method == "I") %>% # filter out non-numeric data
  mutate(score = as.numeric(score)) %>%
  # parse the wave number out of the survey name (first digit found)
  extract(survey_name, "wave", ".*([0-9]{1}).*", remove = FALSE) %>%
  group_by(scale_name, scored_scale) %>%
  mutate(nrow = n()) %>%
  # keep only scale x subscale combinations with enough observations
  filter(nrow > 34) %>%
  do({
    plot = ggplot(., aes(wave, score)) +
      geom_point(aes(group = SID), fill = "black", alpha = .05, size = 3) +
      geom_line(aes(group = SID), color = "black", alpha = .05, size = 1) +
      stat_summary(fun.data = "mean_cl_boot", size = 1.5, color = "#3B9AB2") +
      # `fun` replaces the deprecated `fun.y` argument (ggplot2 >= 3.3.0)
      stat_summary(aes(group = 1), fun = mean, geom = "line", size = 1.5, color = "#3B9AB2") +
      labs(x = "\nwave", y = "score\n", title = sprintf("%s %s\n", .$scale_name[[1]], .$scored_scale[[1]])) +
      theme_minimal(base_size = 16) +
      theme(text = element_text(family = "Futura Medium", colour = "black"),
            axis.text = element_text(color = "black"),
            axis.line = element_line(colour = "black"),
            panel.grid.major = element_blank(),
            panel.grid.minor = element_blank(),
            panel.border = element_blank(),
            panel.background = element_blank(),
            plot.title = element_text(hjust = 0.5))
    print(plot)
    # do() must return a data frame; plotting is the side effect we want
    data.frame()
  })

# Correlation matrix across all scale x subscale x wave variables.
scored %>%
filter(!method == "I") %>% # filter out non-numeric data
mutate(score = as.numeric(score)) %>%
# parse the wave number out of the survey name (first digit found)
extract(survey_name, "wave", ".*([0-9]{1}).*", remove = FALSE) %>%
# one column per scale x subscale x wave, e.g. "SCALE subscale T1"
mutate(var.name = sprintf("%s %s T%s", scale_name, scored_scale, wave)) %>%
ungroup() %>%
select(var.name, score, SID) %>%
spread(var.name, score) %>%
filter(!is.na(SID)) %>%
select(-SID) %>%
# pairwise complete observations so missing waves don't drop whole rows
cor(., use = "pairwise.complete.obs") %>%
ggcorrplot(hc.order = TRUE, outline.col = "white", colors = c("#3B9AB2", "white", "#E46726")) +
# overlay the rounded correlation coefficients on the tiles
geom_text(aes(label = round(value, 2)), size = 4, family = "Futura Medium") +
labs(x = "", y = "") +
theme_minimal(base_size = 16) +
theme(text = element_text(family = "Futura Medium", colour = "black"),
legend.text = element_text(size = 8),
axis.text = element_text(color = "black"),
axis.text.x = element_text(angle = 45, vjust = 1, hjust = 1),
panel.grid.major = element_blank(),
panel.grid.minor = element_blank(),
panel.border = element_blank(),
panel.background = element_blank())